# load packages, installing if missing
if (!require(librarian)){
install.packages("librarian")
library(librarian)
}
## Loading required package: librarian
librarian::shelf(
dismo, dplyr, DT, ggplot2, here, htmltools, leaflet, mapview, purrr, raster, readr, rgbif, rgdal, rJava, sdmpredictors, sf, spocc, tidyr, geojsonio, tidyverse)
##
## The 'cran_repo' argument in shelf() was not set, so it will use
## cran_repo = 'https://cran.r-project.org' by default.
##
## To avoid this message, set the 'cran_repo' argument to a CRAN
## mirror URL (see https://cran.r-project.org/mirrors.html) or set
## 'quiet = TRUE'.
## Warning: multiple methods tables found for 'crop'
## Warning: multiple methods tables found for 'extend'
select <- dplyr::select # overwrite raster::select
# set random seed for reproducibility
set.seed(42)
# directory to store data
dir_data <- here("data/sdm")
dir.create(dir_data, showWarnings = F)
I have chosen the giraffe, Giraffa, as my species for this lab. I have always loved large mammals and giraffes are just especially majestic and interesting animals.
obs_csv <- file.path(dir_data, "obs.csv")
obs_geo <- file.path(dir_data, "obs.geojson")
redo <- TRUE
if (!file.exists(obs_geo) | redo){
# get species occurrence data from GBIF with coordinates
(res <- spocc::occ(
query = 'Giraffa',
from = 'gbif', has_coords = T, limit = 10000))
# extract data frame from result
#df <- res$gbif$data[[1]] # from Ben
df <- res$gbif$data[[1]] %>%
filter(longitude > -26.41,
longitude < 54.98,
latitude > -36.21,
latitude < 38.65) # filter for just the African continent
readr::write_csv(df, obs_csv)
# convert to points of observation from lon/lat columns in data frame
obs <- df %>%
sf::st_as_sf(
coords = c("longitude", "latitude"),
crs = st_crs(4326)) %>%
select(prov, key) # save space (joinable from obs_csv)
sf::write_sf(obs, obs_geo, delete_dsn=T)
}
obs <- sf::read_sf(obs_geo)
nrow(obs) # number of rows
## [1] 9379
# show points on map
mapview::mapview(obs, map.types = "Stamen.Terrain")
Question 1. How many observations total are in GBIF for your species? (Hint: ?occ) There are 9,225 giraffe observations in GBIF, filtering for observations on the African continent only. Before filtering, there were 9,300 observations total in GBIF.
Question 2. Do you see any odd observations, like marine species on land or vice versa? If so, please see the Data Cleaning and explain what you did to fix or remove these points. There are no odd observations, and all of the points are on land; however, I did decide to filter my observations to those appearing only on the African continent. The observations were most abundant here to start with anyways, and since they are native to Africa I felt that it was a good idea to build the species distribution model based on their native habitat. To filter the observations, I included observations that fell within a bounding box of the African continent.
dir_env <- file.path(dir_data, "env")
# set a default data directory
options(sdmpredictors_datadir = dir_env)
# choosing terrestrial
env_datasets <- sdmpredictors::list_datasets(terrestrial = TRUE, marine = FALSE)
# show table of datasets
env_datasets %>%
select(dataset_code, description, citation) %>%
DT::datatable()
# choose datasets for a vector
env_datasets_vec <- c("WorldClim", "ENVIREM")
# get layers
env_layers <- sdmpredictors::list_layers(env_datasets_vec)
DT::datatable(env_layers)
# choose layers after some inspection and perhaps consulting literature
env_layers_vec <- c("WC_alt", "WC_bio1", "WC_bio2", "ER_climaticMoistureIndex", "ER_topoWet")
# get layers
env_stack <- load_layers(env_layers_vec)
# interactive plot layers, hiding all but first (select others)
# mapview(env_stack, hide = T) # makes the html too big for Github
plot(env_stack, nc=2)
Question: What environmental layers did you choose as predictors? Can you find any support for these in the literature? I decided to use the following layers as predictors: WC_alt, WC_bio1, WC_bio2, ER_topoWet, and ER_climaticMoistureIndex. Because giraffes are typically found at low altitudes, in arid environments like grasslands and savannas, I thought the climatic moisture index and altitude layer were important to include. Additionally, I wanted to include WC_bio1 and WC_bio2 to see if temperature played a role in determining the giraffe habitat since they prefer to live in warmer envrieonments. Lastly, I included the ER_topoWet layer because giraffes typically prefer dry climates and I thought this layer would be a good predictor.
obs_hull_geo <- file.path(dir_data, "obs_hull.geojson")
env_stack_grd <- file.path(dir_data, "env_stack.grd")
if (!file.exists(obs_hull_geo) | redo){
# make convex hull around points of observation
obs_hull <- sf::st_convex_hull(st_union(obs))
# save obs hull
write_sf(obs_hull, obs_hull_geo)
}
## Warning in CPL_write_ogr(obj, dsn, layer, driver,
## as.character(dataset_options), : GDAL Error 6: DeleteLayer() not supported by
## this dataset.
obs_hull <- read_sf(obs_hull_geo)
# show points on map
mapview(
list(obs, obs_hull))
## Warning in cbind(`Feature ID` = fid, mat): number of rows of result is not a
## multiple of vector length (arg 1)
if (!file.exists(env_stack_grd) | redo){
obs_hull_sp <- sf::as_Spatial(obs_hull)
env_stack <- raster::mask(env_stack, obs_hull_sp) %>%
raster::crop(extent(obs_hull_sp))
writeRaster(env_stack, env_stack_grd, overwrite=T)
}
env_stack <- stack(env_stack_grd)
# show map
# mapview(obs) +
# mapview(env_stack, hide = T) # makes html too big for Github
plot(env_stack, nc=2)
absence_geo <- file.path(dir_data, "absence.geojson")
pts_geo <- file.path(dir_data, "pts.geojson")
pts_env_csv <- file.path(dir_data, "pts_env.csv")
if (!file.exists(absence_geo) | redo){
# get raster count of observations
r_obs <- rasterize(
sf::as_Spatial(obs), env_stack[[1]], field=1, fun='count')
# show map
# mapview(obs) +
# mapview(r_obs)
# create mask for
r_mask <- mask(env_stack[[1]] > -Inf, r_obs, inverse=T)
# generate random points inside mask
absence <- dismo::randomPoints(r_mask, nrow(obs)) %>%
as_tibble() %>%
st_as_sf(coords = c("x", "y"), crs = 4326)
write_sf(absence, absence_geo, delete_dsn=T)
}
absence <- read_sf(absence_geo)
# show map of presence, ie obs, and absence
mapview(obs, col.regions = "green") +
mapview(absence, col.regions = "gray")
## Warning in cbind(`Feature ID` = fid, mat): number of rows of result is not a
## multiple of vector length (arg 1)
if (!file.exists(pts_env_csv) | redo){
# combine presence and absence into single set of labeled points
pts <- rbind(
obs %>%
mutate(
present = 1) %>%
select(present, key),
absence %>%
mutate(
present = 0,
key = NA)) %>%
mutate(
ID = 1:n()) %>%
relocate(ID)
write_sf(pts, pts_geo, delete_dsn=T)
# extract raster values for points
pts_env <- raster::extract(env_stack, as_Spatial(pts), df=TRUE) %>%
tibble() %>%
# join present and geometry columns to raster value results for points
left_join(
pts %>%
select(ID, present),
by = "ID") %>%
relocate(present, .after = ID) %>%
# extract lon, lat as single columns
mutate(
#present = factor(present),
lon = st_coordinates(geometry)[,1],
lat = st_coordinates(geometry)[,2]) %>%
select(-geometry)
write_csv(pts_env, pts_env_csv)
}
pts_env <- read_csv(pts_env_csv)
## Rows: 18758 Columns: 9
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (9): ID, present, WC_alt, WC_bio1, WC_bio2, ER_climaticMoistureIndex, ER...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
pts_env %>%
# show first 10 presence, last 10 absence
slice(c(1:10, (nrow(pts_env)-9):nrow(pts_env))) %>%
DT::datatable(
rownames = F,
options = list(
dom = "t",
pageLength = 20))
pts_env %>%
select(-ID) %>%
mutate(
present = factor(present)) %>%
pivot_longer(-present) %>%
ggplot() +
geom_density(aes(x = value, fill = present)) +
scale_fill_manual(values = alpha(c("gray", "green"), 0.5)) +
scale_x_continuous(expand=c(0,0)) +
scale_y_continuous(expand=c(0,0)) +
theme_bw() +
facet_wrap(~name, scales = "free") +
theme(
legend.position = c(1, 0),
legend.justification = c(1, 0))
## Warning: Removed 163 rows containing non-finite values (stat_density).
librarian::shelf(
DT, dplyr, dismo, GGally, here, readr, tidyr)
##
## The 'cran_repo' argument in shelf() was not set, so it will use
## cran_repo = 'https://cran.r-project.org' by default.
##
## To avoid this message, set the 'cran_repo' argument to a CRAN
## mirror URL (see https://cran.r-project.org/mirrors.html) or set
## 'quiet = TRUE'.
select <- dplyr::select # overwrite raster::select
options(readr.show_col_types = F)
dir_data <- here("data/sdm")
pts_env_csv <- file.path(dir_data, "pts_env.csv")
pts_env <- read_csv(pts_env_csv)
nrow(pts_env)
## [1] 18758
datatable(pts_env, rownames = F)
GGally::ggpairs(
select(pts_env, -ID),
aes(color = factor(present)))
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 33 rows containing missing values
## Warning in cor(x, y): the standard deviation is zero
## Warning in cor(x, y): the standard deviation is zero
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 33 rows containing missing values
## Warning in cor(x, y): the standard deviation is zero
## Warning in cor(x, y): the standard deviation is zero
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 33 rows containing missing values
## Warning in cor(x, y): the standard deviation is zero
## Warning in cor(x, y): the standard deviation is zero
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 33 rows containing missing values
## Warning in cor(x, y): the standard deviation is zero
## Warning in cor(x, y): the standard deviation is zero
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 31 rows containing missing values
## Warning in cor(x, y): the standard deviation is zero
## Warning in cor(x, y): the standard deviation is zero
## Warning in cor(x, y): the standard deviation is zero
## Warning in cor(x, y): the standard deviation is zero
## Warning in cor(x, y): the standard deviation is zero
## Warning in cor(x, y): the standard deviation is zero
## Warning: Removed 33 rows containing missing values (geom_point).
## Warning: Removed 33 rows containing non-finite values (stat_density).
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 33 rows containing missing values
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 33 rows containing missing values
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 33 rows containing missing values
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 35 rows containing missing values
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 33 rows containing missing values
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 33 rows containing missing values
## Warning: Removed 33 rows containing missing values (geom_point).
## Warning: Removed 33 rows containing missing values (geom_point).
## Warning: Removed 33 rows containing non-finite values (stat_density).
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 33 rows containing missing values
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 33 rows containing missing values
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 35 rows containing missing values
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 33 rows containing missing values
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 33 rows containing missing values
## Warning: Removed 33 rows containing missing values (geom_point).
## Warning: Removed 33 rows containing missing values (geom_point).
## Warning: Removed 33 rows containing missing values (geom_point).
## Warning: Removed 33 rows containing non-finite values (stat_density).
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 33 rows containing missing values
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 35 rows containing missing values
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 33 rows containing missing values
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 33 rows containing missing values
## Warning: Removed 33 rows containing missing values (geom_point).
## Warning: Removed 33 rows containing missing values (geom_point).
## Warning: Removed 33 rows containing missing values (geom_point).
## Warning: Removed 33 rows containing missing values (geom_point).
## Warning: Removed 33 rows containing non-finite values (stat_density).
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 35 rows containing missing values
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 33 rows containing missing values
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 33 rows containing missing values
## Warning: Removed 31 rows containing missing values (geom_point).
## Warning: Removed 35 rows containing missing values (geom_point).
## Warning: Removed 35 rows containing missing values (geom_point).
## Warning: Removed 35 rows containing missing values (geom_point).
## Warning: Removed 35 rows containing missing values (geom_point).
## Warning: Removed 31 rows containing non-finite values (stat_density).
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 31 rows containing missing values
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 31 rows containing missing values
## Warning: Removed 33 rows containing missing values (geom_point).
## Warning: Removed 33 rows containing missing values (geom_point).
## Warning: Removed 33 rows containing missing values (geom_point).
## Warning: Removed 33 rows containing missing values (geom_point).
## Warning: Removed 31 rows containing missing values (geom_point).
## Warning: Removed 33 rows containing missing values (geom_point).
## Warning: Removed 33 rows containing missing values (geom_point).
## Warning: Removed 33 rows containing missing values (geom_point).
## Warning: Removed 33 rows containing missing values (geom_point).
## Warning: Removed 31 rows containing missing values (geom_point).
# setup model data
d <- pts_env %>%
select(-ID) %>% # remove terms we don't want to model
tidyr::drop_na() # drop the rows with NA values
nrow(d)
## [1] 18723
# fit a linear model
mdl <- lm(present ~ ., data = d)
summary(mdl)
##
## Call:
## lm(formula = present ~ ., data = d)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.9515 -0.3238 0.1843 0.3135 1.2728
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -3.460e-01 4.258e-02 -8.126 4.73e-16 ***
## WC_alt 2.153e-04 9.853e-06 21.847 < 2e-16 ***
## WC_bio1 3.160e-02 1.423e-03 22.203 < 2e-16 ***
## WC_bio2 -1.154e-04 2.009e-03 -0.057 0.954
## ER_climaticMoistureIndex -2.522e-01 1.591e-02 -15.855 < 2e-16 ***
## ER_topoWet -4.112e-02 2.790e-03 -14.736 < 2e-16 ***
## lon 1.261e-02 2.770e-04 45.522 < 2e-16 ***
## lat -1.151e-02 2.216e-04 -51.920 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.4087 on 18715 degrees of freedom
## Multiple R-squared: 0.3321, Adjusted R-squared: 0.3318
## F-statistic: 1329 on 7 and 18715 DF, p-value: < 2.2e-16
y_predict <- predict(mdl, d, type="response")
y_true <- d$present
range(y_predict)
## [1] -0.4698451 0.9514745
range(y_true)
## [1] 0 1
# fit a generalized linear model with a binomial logit link function
mdl <- glm(present ~ ., family = binomial(link="logit"), data = d)
summary(mdl)
##
## Call:
## glm(formula = present ~ ., family = binomial(link = "logit"),
## data = d)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.3128 -0.6827 -0.0385 0.8209 3.2534
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -8.444e+00 3.355e-01 -25.171 <2e-16 ***
## WC_alt 1.705e-03 7.307e-05 23.333 <2e-16 ***
## WC_bio1 3.259e-01 1.337e-02 24.383 <2e-16 ***
## WC_bio2 -6.694e-03 1.221e-02 -0.548 0.583
## ER_climaticMoistureIndex -1.572e+00 1.043e-01 -15.073 <2e-16 ***
## ER_topoWet -2.909e-01 1.773e-02 -16.411 <2e-16 ***
## lon 8.396e-02 2.089e-03 40.189 <2e-16 ***
## lat -8.174e-02 2.032e-03 -40.223 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 25956 on 18722 degrees of freedom
## Residual deviance: 18280 on 18715 degrees of freedom
## AIC: 18296
##
## Number of Fisher Scoring iterations: 5
y_predict <- predict(mdl, d, type="response")
range(y_predict)
## [1] 0.0004095163 0.9310590901
# show term plots
termplot(mdl, partial.resid = TRUE, se = TRUE, main = F, ylim = "free")
librarian::shelf(mgcv)
##
## The 'cran_repo' argument in shelf() was not set, so it will use
## cran_repo = 'https://cran.r-project.org' by default.
##
## To avoid this message, set the 'cran_repo' argument to a CRAN
## mirror URL (see https://cran.r-project.org/mirrors.html) or set
## 'quiet = TRUE'.
# fit a generalized additive model with smooth predictors
mdl <- mgcv::gam(
formula = present ~ s(WC_alt) + s(WC_bio1) +
s(WC_bio2) + s(ER_climaticMoistureIndex) + s(ER_topoWet) + s(lon) + s(lat),
family = binomial, data = d)
summary(mdl)
##
## Family: binomial
## Link function: logit
##
## Formula:
## present ~ s(WC_alt) + s(WC_bio1) + s(WC_bio2) + s(ER_climaticMoistureIndex) +
## s(ER_topoWet) + s(lon) + s(lat)
##
## Parametric coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.6404 0.1617 -10.14 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Approximate significance of smooth terms:
## edf Ref.df Chi.sq p-value
## s(WC_alt) 8.267 8.828 223.2 <2e-16 ***
## s(WC_bio1) 8.894 8.988 378.6 <2e-16 ***
## s(WC_bio2) 8.787 8.981 275.2 <2e-16 ***
## s(ER_climaticMoistureIndex) 6.951 7.638 630.1 <2e-16 ***
## s(ER_topoWet) 8.822 8.988 184.1 <2e-16 ***
## s(lon) 8.995 9.000 492.9 <2e-16 ***
## s(lat) 8.817 8.985 1243.6 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## R-sq.(adj) = 0.714 Deviance explained = 65.5%
## UBRE = -0.51533 Scale est. = 1 n = 18723
# show term plots
plot(mdl, scale=0)
Question: Which GAM environmental variables, and even range of values, seem to contribute most towards presence (above 0 response) versus absence (below 0 response)?
Altitude above about 2000, Bio_1 above about 26, Bio_2 above about 17, ER_topo_Wet under 9, and latitude from about -40 to -20 seem to contribute most towards presence. On the flip side, Bio_1 from about 0-20, ER_climaticMoistureIndex from -1 and above, ER_topoWet above 14, and longitude froom about -7 to 0 seem to contribute the most to absense.
# load extra packages
librarian::shelf(
maptools, sf)
##
## The 'cran_repo' argument in shelf() was not set, so it will use
## cran_repo = 'https://cran.r-project.org' by default.
##
## To avoid this message, set the 'cran_repo' argument to a CRAN
## mirror URL (see https://cran.r-project.org/mirrors.html) or set
## 'quiet = TRUE'.
mdl_maxent_rds <- file.path(dir_data, "mdl_maxent.rds")
# show version of maxent
if (!interactive())
maxent()
## This is MaxEnt version 3.4.3
# get environmental rasters
# NOTE: the first part of Lab 1. SDM - Explore got updated to write this clipped environmental raster stack
env_stack_grd <- file.path(dir_data, "env_stack.grd")
env_stack <- stack(env_stack_grd)
plot(env_stack, nc=2)
# get presence-only observation points (maxent extracts raster values for you)
obs_geo <- file.path(dir_data, "obs.geojson")
obs_sp <- read_sf(obs_geo) %>%
sf::as_Spatial() # maxent prefers sp::SpatialPoints over newer sf::sf class
# fit a maximum entropy model
if (!file.exists(mdl_maxent_rds)){
mdl <- maxent(env_stack, obs_sp)
readr::write_rds(mdl, mdl_maxent_rds)
}
mdl <- read_rds(mdl_maxent_rds)
# plot variable contributions per predictor
plot(mdl)
# plot term plots
response(mdl)
Question: Which Maxent environmental variables, and even range of values, seem to contribute most towards presence (closer to 1 response) and how might this differ from the GAM results?
ER_climaticMoistureIndex and WC_bio1 seem to contribute the most towards giraffe presence in the maxent model. This is different from the GAM results because while it seemed like ER_climaticMoistureIndex did influence presence, it was not as obvious how much it contributed to presence. In the term plots above we can see that giraffe presence is heavily influenced by ER_climaticMoistureIndex hovering right around 0. Additionally, WC_bio1 heavily contributed to giraffe presence for values around 20, peaking around 21.
# predict
y_predict <- predict(env_stack, mdl) #, ext=ext, progress='')
plot(y_predict, main='Maxent, raw prediction')
data(wrld_simpl, package="maptools")
plot(wrld_simpl, add=TRUE, border='dark grey')
# Lab 1c. Species Distribution Modeling - Decision Trees
# global knitr chunk options
knitr::opts_chunk$set(
warning = FALSE,
message = FALSE)
# load packages
librarian::shelf(
caret, # m: modeling framework
dplyr, ggplot2 ,here, readr,
pdp, # X: partial dependence plots
rpart, # m: recursive partition modeling
rpart.plot, # m: recursive partition plotting
rsample, # d: split train/test data
skimr, # d: skim summarize data table
vip) # X: variable importance
##
## The 'cran_repo' argument in shelf() was not set, so it will use
## cran_repo = 'https://cran.r-project.org' by default.
##
## To avoid this message, set the 'cran_repo' argument to a CRAN
## mirror URL (see https://cran.r-project.org/mirrors.html) or set
## 'quiet = TRUE'.
# options
options(
scipen = 999,
readr.show_col_types = F)
set.seed(42)
# graphical theme
ggplot2::theme_set(ggplot2::theme_light())
# paths
dir_data <- here("data/sdm")
pts_env_csv <- file.path(dir_data, "pts_env.csv")
# read data
pts_env <- read_csv(pts_env_csv)
d <- pts_env %>%
select(-ID) %>% # not used as a predictor x
mutate(
present = factor(present)) %>% # categorical response
na.omit() # drop rows with NA
skim(d)
| Name | d |
| Number of rows | 18723 |
| Number of columns | 8 |
| _______________________ | |
| Column type frequency: | |
| factor | 1 |
| numeric | 7 |
| ________________________ | |
| Group variables | None |
Variable type: factor
| skim_variable | n_missing | complete_rate | ordered | n_unique | top_counts |
|---|---|---|---|---|---|
| present | 0 | 1 | FALSE | 2 | 0: 9377, 1: 9346 |
Variable type: numeric
| skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
|---|---|---|---|---|---|---|---|---|---|---|
| WC_alt | 0 | 1 | 815.31 | 515.32 | -415.00 | 371.00 | 734.00 | 1209.00 | 3463.00 | ▅▇▅▁▁ |
| WC_bio1 | 0 | 1 | 22.84 | 3.46 | 5.90 | 21.00 | 22.20 | 25.20 | 30.60 | ▁▁▃▇▃ |
| WC_bio2 | 0 | 1 | 13.95 | 2.27 | 5.50 | 12.60 | 14.00 | 15.80 | 19.60 | ▁▂▇▇▂ |
| ER_climaticMoistureIndex | 0 | 1 | -0.58 | 0.29 | -1.00 | -0.79 | -0.61 | -0.34 | 0.60 | ▇▆▅▁▁ |
| ER_topoWet | 0 | 1 | 12.36 | 1.39 | 7.30 | 11.27 | 12.43 | 13.23 | 15.82 | ▁▂▇▇▂ |
| lon | 0 | 1 | 24.24 | 13.03 | -16.54 | 16.29 | 29.12 | 34.30 | 40.58 | ▁▂▂▃▇ |
| lat | 0 | 1 | -0.47 | 16.42 | -34.16 | -13.18 | -0.62 | 11.96 | 38.04 | ▃▃▇▃▂ |
# create training set with 80% of full data
d_split <- rsample::initial_split(d, prop = 0.8, strata = "present")
d_train <- rsample::training(d_split)
# show number of rows present is 0 vs 1
table(d$present)
##
## 0 1
## 9377 9346
table(d_train$present)
##
## 0 1
## 7501 7476
# run decision stump model
mdl <- rpart(
present ~ ., data = d_train,
control = list(
cp = 0, minbucket = 5, maxdepth = 1))
mdl
## n= 14977
##
## node), split, n, loss, yval, (yprob)
## * denotes terminal node
##
## 1) root 14977 7476 0 (0.5008346 0.4991654)
## 2) lat>=2.707017 5411 636 0 (0.8824617 0.1175383) *
## 3) lat< 2.707017 9566 2726 1 (0.2849676 0.7150324) *
# plot tree
par(mar = c(1, 1, 1, 1))
rpart.plot(mdl)
## 2.2 Partition, depth=default
# decision tree with defaults
mdl <- rpart(present ~ ., data = d_train)
mdl
## n= 14977
##
## node), split, n, loss, yval, (yprob)
## * denotes terminal node
##
## 1) root 14977 7476 0 (0.50083461 0.49916539)
## 2) lat>=2.707017 5411 636 0 (0.88246165 0.11753835)
## 4) WC_bio1< 28.75 4543 175 0 (0.96147920 0.03852080) *
## 5) WC_bio1>=28.75 868 407 1 (0.46889401 0.53110599)
## 10) lat>=13.61245 367 12 0 (0.96730245 0.03269755) *
## 11) lat< 13.61245 501 52 1 (0.10379242 0.89620758) *
## 3) lat< 2.707017 9566 2726 1 (0.28496759 0.71503241)
## 6) lon< 30.79615 3526 1455 0 (0.58735111 0.41264889)
## 12) lat>=-15.13977 1237 79 0 (0.93613581 0.06386419) *
## 13) lat< -15.13977 2289 913 1 (0.39886413 0.60113587)
## 26) WC_bio1< 21.45 1331 622 0 (0.53268219 0.46731781)
## 52) ER_climaticMoistureIndex< -0.705 691 231 0 (0.66570188 0.33429812) *
## 53) ER_climaticMoistureIndex>=-0.705 640 249 1 (0.38906250 0.61093750) *
## 27) WC_bio1>=21.45 958 204 1 (0.21294363 0.78705637) *
## 7) lon>=30.79615 6040 655 1 (0.10844371 0.89155629) *
rpart.plot(mdl)
# plot complexity parameter
plotcp(mdl)
# rpart cross validation results
mdl$cptable
## CP nsplit rel error xerror xstd
## 1 0.55029428 0 1.0000000 1.0279561 0.008182077
## 2 0.08239700 1 0.4497057 0.4547887 0.006857341
## 3 0.06193151 2 0.3673087 0.3673087 0.006334313
## 4 0.02655163 3 0.3053772 0.3060460 0.005889250
## 5 0.01531568 5 0.2522739 0.2546816 0.005453049
## 6 0.01000000 7 0.2216426 0.2352862 0.005270285
Question: Based on the complexity plot threshold, what size of tree is recommended?
A size of 8 is recommended based on the complexity plot threshold.
# caret cross validation results
mdl_caret <- train(
present ~ .,
data = d_train,
method = "rpart",
trControl = trainControl(method = "cv", number = 10),
tuneLength = 20)
ggplot(mdl_caret)
vip(mdl_caret, num_features = 40, bar = FALSE)
Question: what are the top 3 most important variables of your model?
Based on the plot above, lat, lon, and ER_climaticMoistureIndex are the most important variables of my model.
# Construct partial dependence plots
p1 <- partial(mdl_caret, pred.var = "lat") %>% autoplot()
p2 <- partial(mdl_caret, pred.var = "WC_bio2") %>% autoplot()
p3 <- partial(mdl_caret, pred.var = c("lat", "WC_bio2")) %>%
plotPartial(levelplot = FALSE, zlab = "yhat", drape = TRUE,
colorkey = TRUE, screen = list(z = -20, x = -60))
# Display plots side by side
gridExtra::grid.arrange(p1, p2, p3, ncol = 3)
# load additional packages
librarian::shelf(
ranger) # random forest modeling
# number of features
n_features <- length(setdiff(names(d_train), "present"))
# fit a default random forest model
mdl_rf <- ranger(present ~ ., data = d_train)
# get out of the box RMSE
(default_rmse <- sqrt(mdl_rf$prediction.error))
## [1] 0.1810624
# re-run model with impurity-based variable importance
mdl_impurity <- ranger(
present ~ ., data = d_train,
importance = "impurity")
# re-run model with permutation-based variable importance
mdl_permutation <- ranger(
present ~ ., data = d_train,
importance = "permutation")
p1 <- vip::vip(mdl_impurity, bar = FALSE)
p2 <- vip::vip(mdl_permutation, bar = FALSE)
gridExtra::grid.arrange(p1, p2, nrow = 1)
Question: How might variable importance differ between rpart and RandomForest in your model outputs?
Besides lat and lon, WC_bio1 and ER_climaticMoistureIndex are the two most important variables in both the rpart model and the random forest model. The bottom three variables are also the same in both models, with ER_topoWet being the least important variable.
# global knitr chunk options
knitr::opts_chunk$set(
warning = FALSE,
message = FALSE)
# load packages
librarian::shelf(
dismo, # species distribution modeling: maxent(), predict(), evaluate(),
dplyr, ggplot2, GGally, here, maptools, readr,
raster, readr, rsample, sf,
usdm) # uncertainty analysis for species distribution models: vifcor()
select = dplyr::select
# options
set.seed(42)
options(
scipen = 999,
readr.show_col_types = F)
ggplot2::theme_set(ggplot2::theme_light())
# paths
dir_data <- here("data/sdm")
pts_geo <- file.path(dir_data, "pts.geojson")
env_stack_grd <- file.path(dir_data, "env_stack.grd")
mdl_maxv_rds <- file.path(dir_data, "mdl_maxent_vif.rds")
# read points of observation: presence (1) and absence (0)
pts <- read_sf(pts_geo)
# read raster stack of environment
env_stack <- raster::stack(env_stack_grd)
# create training set with 80% of full data
pts_split <- rsample::initial_split(
pts, prop = 0.8, strata = "present")
pts_train <- rsample::training(pts_split)
pts_test <- rsample::testing(pts_split)
pts_train_p <- pts_train %>%
filter(present == 1) %>%
as_Spatial()
pts_train_a <- pts_train %>%
filter(present == 0) %>%
as_Spatial()
# show pairs plot before multicollinearity reduction with vifcor()
pairs(env_stack)
# calculate variance inflation factor per predictor, a metric of multicollinearity between variables
vif(env_stack)
## Variables VIF
## 1 WC_alt 2.337270
## 2 WC_bio1 2.043901
## 3 WC_bio2 2.682166
## 4 ER_climaticMoistureIndex 2.494296
## 5 ER_topoWet 1.686271
# stepwise reduce predictors, based on a max correlation of 0.7 (max 1)
v <- vifcor(env_stack, th=0.7)
v
## 1 variables from the 5 input variables have collinearity problem:
##
## WC_bio2
##
## After excluding the collinear variables, the linear correlation coefficients ranges between:
## min correlation ( ER_climaticMoistureIndex ~ WC_bio1 ): -0.07796855
## max correlation ( WC_bio1 ~ WC_alt ): -0.6385333
##
## ---------- VIFs of the remained variables --------
## Variables VIF
## 1 WC_alt 1.776481
## 2 WC_bio1 1.902871
## 3 ER_climaticMoistureIndex 1.235929
## 4 ER_topoWet 1.564684
# reduce enviromental raster stack by
env_stack_v <- usdm::exclude(env_stack, v)
# show pairs plot after multicollinearity reduction with vifcor()
pairs(env_stack_v)
# fit a maximum entropy model
if (!file.exists(mdl_maxv_rds)){
mdl_maxv <- maxent(env_stack_v, sf::as_Spatial(pts_train))
readr::write_rds(mdl_maxv, mdl_maxv_rds)
}
mdl_maxv <- read_rds(mdl_maxv_rds)
# plot variable contributions per predictor
plot(mdl_maxv)
Question: Which variables were removed due to multicollinearity and what is the rank of most to least important remaining variables in your model?
None of my variables had colliniarity problems, so none were removed. ER_climaticMoistureIndex was the most important, followed by WC_bio1, WC_alt, ER_topoWet, and WC_bio2. The bottom three variables all just barely contributed.
# plot term plots
response(mdl_maxv)
# predict
y_maxv <- predict(env_stack, mdl_maxv) #, ext=ext, progress='')
plot(y_maxv, main='Maxent, raw prediction')
data(wrld_simpl, package="maptools")
plot(wrld_simpl, add=TRUE, border='dark grey')
pts_test_p <- pts_test %>%
filter(present == 1) %>%
as_Spatial()
pts_test_a <- pts_test %>%
filter(present == 0) %>%
as_Spatial()
y_maxv <- predict(mdl_maxv, env_stack)
#plot(y_maxv)
e <- dismo::evaluate(
p = pts_test_p,
a = pts_test_a,
model = mdl_maxv,
x = env_stack)
e
## class : ModelEvaluation
## n presences : 1869
## n absences : 1875
## AUC : 0.8478151
## cor : 0.6003439
## max TPR+TNR at : 0.6426489
plot(e, 'ROC')
thr <- threshold(e)[['spec_sens']]
thr
## [1] 0.6426489
p_true <- na.omit(raster::extract(y_maxv, pts_test_p) >= thr)
a_true <- na.omit(raster::extract(y_maxv, pts_test_a) < thr)
# (t)rue/(f)alse (p)ositive/(n)egative rates
tpr <- sum(p_true)/length(p_true)
fnr <- sum(!p_true)/length(p_true)
fpr <- sum(!a_true)/length(a_true)
tnr <- sum(a_true)/length(a_true)
matrix(
c(tpr, fnr,
fpr, tnr),
nrow=2, dimnames = list(
c("present_obs", "absent_obs"),
c("present_pred", "absent_pred")))
## present_pred absent_pred
## present_obs 0.92455859 0.3061333
## absent_obs 0.07544141 0.6938667
# add point to ROC plot
plot(e, 'ROC')
points(fpr, tpr, pch=23, bg="blue")
plot(y_maxv > thr)